This file contains all the relevant Python code for my final project in my Data Science III class, which replicates Wallace, Sharfstein, and Kaminsky (2019). Their paper uses k-means clustering to divide counties into 8 clusters based on their sociodemographic characteristics. My analysis attempts to recreate their findings and also considers additional clustering methods and robustness of results to the exclusion of outliers.
This Jupyter Notebook is divided into three separate sections. The first imports the raw data and cleans it. The second does basic exploratory data analysis. The third and final section conducts the analyses using three clustering techniques: k-means, agglomerative hierarchical clustering, and density-based scanning.
# For basic data cleaning
import pandas as pd
# For working with matrices
import numpy as np
# An easy way of getting census data
import censusdata
# For extracting information from strings
import re
# For reading excel files
import xlrd
# For plotting
import seaborn as sns
import matplotlib.pyplot as plt
# For linear regression
from sklearn.linear_model import LinearRegression
# For copying dataframes
import copy
# For creating county choropleth maps
import plotly.figure_factory as ff
# For clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage
# Statistical packages
from scipy import stats
In this section, we import and clean raw data from the following sources:
After importing and cleaning these datasets, we combine them to form a completed dataset with each observation corresponding to a county.
We use the censusdata package to import and clean this data.
# Get data for all counties from the 2014-2018 5-year ACS.
# We download all the variables that we need to calculate the variables listed above.
county_ACS_data = censusdata.download('acs5', 2018,
censusdata.censusgeo([('state', '*'), ('county', '*')]),
# The following variables are all related to educational attainment
# (ACS table B15001: sex by age by educational attainment)
['B15001_011E', # Male 25-34 (total)
'B15001_015E', # 15-18 are male 25-34 with at
# least some college education (these codes feed
# Percent_Some_College below; the original note
# said "bachelor's degree" -- TODO confirm against
# the B15001 table shell)
'B15001_016E',
'B15001_017E',
'B15001_018E',
'B15001_019E', # Male 35-44 (total)
'B15001_023E', # 23-26 are male 35-44 with at
# least some college education
'B15001_024E',
'B15001_025E',
'B15001_026E',
'B15001_052E', # Female 25-34 (total)
'B15001_056E', # 56-59 are female 25-34 with at
# least some college education
'B15001_057E',
'B15001_058E',
'B15001_059E',
'B15001_060E', # Female 35-44 (total)
'B15001_064E', # 64-67 are female 35-44 with at
# least some college education
'B15001_065E',
'B15001_066E',
'B15001_067E',
# The following variables are related to labor force and unemployment
'B23025_005E', # unemployed (numerator of the unemployment share below)
'B23025_001E', # population 16+ (denominator below)
# The following variables are related to pop. without health insurance
'B27010_017E', # 17/33/50: uninsured counts by age group (see
'B27010_033E', # Percent_Pop_Under_65_Without_Health_Insurance below)
'B27010_050E',
'B27010_002E', # 02/18/34: under-65 group totals (denominator below)
'B27010_018E',
'B27010_034E',
# The following variables are related to racial composition
'B03002_003E', # non-Hispanic white
'B03002_001E', # total population (denominator for all shares)
'B03002_004E', # non-Hispanic Black
'B03002_005E', # non-Hispanic American Indian / Alaska Native
'B03002_012E', # Hispanic
# The following variable is median age
'B01002_001E',
# The following variables are related to the percent of adult pop.
# that is married
'B12001_004E', # married counts (male) used in Percent_Adult_Pop_Married
'B12001_013E', # married counts (female)
'B12001_001E', # adult total (denominator)
# The following are total population and female population
'B01001_001E',
'B01001_026E'])
# Create a variable equal to the 5-digit county FIPS code, extracted from the
# censusgeo index object returned by censusdata.
county_ACS_data = county_ACS_data.reset_index()
county_ACS_data = county_ACS_data.rename(columns={"index": "county_description"})
# geo.params() is a tuple like (('state', '28'), ('county', '151')); joining all
# digits yields the 5-digit FIPS (2-digit state + 3-digit county) as an integer.
# This replaces the original row-by-row .loc loop (and its unnecessary int()
# column initializer) with a single vectorized .apply over the geo objects.
county_ACS_data['County_FIPS'] = county_ACS_data['county_description'].apply(
    lambda geo: int(''.join(re.findall(r'\d+', str(geo.params())))))
# ---- Construct the analysis variables from the raw ACS columns ----

# Percent of the population aged 25-44 with at least some college education.
# Numerator: male and female 25-34 / 35-44 counts with some college or more.
college_numerator = (county_ACS_data['B15001_015E'] + county_ACS_data['B15001_016E'] +
                     county_ACS_data['B15001_017E'] + county_ACS_data['B15001_018E'] +
                     county_ACS_data['B15001_023E'] + county_ACS_data['B15001_024E'] +
                     county_ACS_data['B15001_025E'] + county_ACS_data['B15001_026E'] +
                     county_ACS_data['B15001_056E'] + county_ACS_data['B15001_057E'] +
                     county_ACS_data['B15001_058E'] + county_ACS_data['B15001_059E'] +
                     county_ACS_data['B15001_064E'] + county_ACS_data['B15001_065E'] +
                     county_ACS_data['B15001_066E'] + county_ACS_data['B15001_067E'])
# Denominator: total male and female population aged 25-34 and 35-44.
college_denominator = (county_ACS_data['B15001_011E'] + county_ACS_data['B15001_019E'] +
                       county_ACS_data['B15001_052E'] + county_ACS_data['B15001_060E'])
county_ACS_data['Percent_Some_College'] = 100 * (college_numerator / college_denominator)

# Percent of the population aged 16+ that is unemployed but seeking work.
county_ACS_data['Percent_Pop_Active_LF_Unemployed'] = 100 * (county_ACS_data['B23025_005E'] / county_ACS_data['B23025_001E'])

# Percent of the under-65 population without health insurance.
uninsured_count = (county_ACS_data['B27010_017E'] + county_ACS_data['B27010_033E'] +
                   county_ACS_data['B27010_050E'])
under_65_total = (county_ACS_data['B27010_002E'] + county_ACS_data['B27010_018E'] +
                  county_ACS_data['B27010_034E'])
county_ACS_data['Percent_Pop_Under_65_Without_Health_Insurance'] = 100 * (uninsured_count / under_65_total)

# Racial/ethnic composition, as shares of the total population.
total_pop = county_ACS_data['B03002_001E']
county_ACS_data['Percent_Pop_Non_Hispanic_White'] = 100 * county_ACS_data['B03002_003E'] / total_pop
county_ACS_data['Percent_Pop_Non_Hispanic_Black'] = 100 * county_ACS_data['B03002_004E'] / total_pop
county_ACS_data['Percent_Pop_Non_Hispanic_Indian_Alaskan_Native'] = 100 * county_ACS_data['B03002_005E'] / total_pop
county_ACS_data['Percent_Pop_Hispanic'] = 100 * county_ACS_data['B03002_012E'] / total_pop

# Median age comes straight from its ACS column.
county_ACS_data['Median_Age'] = county_ACS_data['B01002_001E']

# Percent of the adult population that is married (male + female married counts
# over the adult total).
married_count = county_ACS_data['B12001_004E'] + county_ACS_data['B12001_013E']
county_ACS_data['Percent_Adult_Pop_Married'] = 100 * (married_count / county_ACS_data['B12001_001E'])

# Total population and the share of the population that is female.
county_ACS_data['Total_Population'] = county_ACS_data['B01001_001E']
county_ACS_data['Percent_Population_Female'] = 100 * (county_ACS_data['B01001_026E']) / county_ACS_data['B01001_001E']

# Keep only the constructed variables (plus the FIPS identifier).
county_ACS_data = county_ACS_data[['County_FIPS', 'Total_Population', 'Percent_Population_Female',
                                   'Percent_Adult_Pop_Married', 'Median_Age', 'Percent_Pop_Non_Hispanic_White',
                                   'Percent_Pop_Non_Hispanic_Black', 'Percent_Pop_Non_Hispanic_Indian_Alaskan_Native',
                                   'Percent_Pop_Hispanic', 'Percent_Pop_Under_65_Without_Health_Insurance',
                                   'Percent_Pop_Active_LF_Unemployed', 'Percent_Some_College']]
# Display the head of the data
county_ACS_data.head()
| County_FIPS | Total_Population | Percent_Population_Female | Percent_Adult_Pop_Married | Median_Age | Percent_Pop_Non_Hispanic_White | Percent_Pop_Non_Hispanic_Black | Percent_Pop_Non_Hispanic_Indian_Alaskan_Native | Percent_Pop_Hispanic | Percent_Pop_Under_65_Without_Health_Insurance | Percent_Pop_Active_LF_Unemployed | Percent_Some_College | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 28151 | 47086 | 53.238755 | 37.673522 | 36.9 | 25.317504 | 71.981056 | 0.118931 | 1.548231 | 21.240205 | 8.421723 | 52.924188 |
| 1 | 28111 | 12028 | 51.405055 | 53.920560 | 40.9 | 77.311274 | 19.687396 | 0.000000 | 1.504822 | 18.160734 | 3.544384 | 50.052798 |
| 2 | 28019 | 8321 | 49.837760 | 49.552326 | 44.0 | 67.455835 | 31.174138 | 0.276409 | 0.396587 | 10.667077 | 4.480500 | 54.336877 |
| 3 | 28057 | 23480 | 49.655026 | 52.550200 | 40.0 | 90.242760 | 7.061329 | 0.319421 | 1.533220 | 13.625580 | 2.184812 | 53.250940 |
| 4 | 28015 | 10129 | 49.491559 | 57.068619 | 47.0 | 64.280778 | 34.564123 | 0.059236 | 0.286307 | 10.836232 | 4.485959 | 58.349802 |
This data is downloaded from the National Historical Geographic Information System (NHGIS) from the 2010 Census, and we read it in below.
# Load the NHGIS 2010 Census extract with the rural population counts.
percent_rural_data = pd.read_csv('nhgis0028_csv/nhgis0028_ds172_2010_county.csv', encoding='latin-1')
# Build the 5-digit county FIPS (state code * 1000 + county code) and the rural
# population share, then keep only those two columns.
percent_rural_data = percent_rural_data.assign(
    County_FIPS=1000 * percent_rural_data['STATEA'] + percent_rural_data['COUNTYA'],
    Percent_Rural=100 * percent_rural_data['H7W005'] / percent_rural_data['H7W001'],
)[['County_FIPS', 'Percent_Rural']]
# Display the head of the data
percent_rural_data.head()
| County_FIPS | Percent_Rural | |
|---|---|---|
| 0 | 1001 | 42.002162 |
| 1 | 1003 | 42.279099 |
| 2 | 1005 | 67.789635 |
| 3 | 1007 | 68.352607 |
| 4 | 1009 | 89.951502 |
This data is downloaded from https://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation/national-data-documentation-2010-2018.
# Read in County Health Rankings data
health_rankings_data = pd.read_excel('2016 County Health Rankings Data - v3.xls',
                                     sheet_name ='Ranked Measure Data',
                                     skiprows = 1,
                                     usecols = 'A:EZ')
# Rename county FIPS variable, obesity variable, and smoker variable
health_rankings_data = health_rankings_data.rename(columns = {"FIPS" : "County_FIPS",
                                                              "% Obese" : "Percent_Adult_Pop_Obese",
                                                              "% Smokers" : "Percent_Adult_Pop_Smokers"})
# The number of driving deaths provided is over a 5-year period, so we need to divide by 5.
health_rankings_data['Annual_Driving_Deaths'] = health_rankings_data['# Driving Deaths']/5
# NA values of driving deaths correspond to zeroes, so replace accordingly
# (assumes missing means "no recorded driving deaths" -- TODO confirm against
# the County Health Rankings documentation)
health_rankings_data['Annual_Driving_Deaths'] = health_rankings_data['Annual_Driving_Deaths'].fillna(0)
# Keep only variables we need
health_rankings_data = health_rankings_data[['County_FIPS', 'Percent_Adult_Pop_Obese',
                                             'Percent_Adult_Pop_Smokers', 'Annual_Driving_Deaths']]
# Count missing values in each retained column (note: this displays the NA
# counts, not the head of the data)
health_rankings_data.isna().sum()
County_FIPS 2 Percent_Adult_Pop_Obese 2 Percent_Adult_Pop_Smokers 3 Annual_Driving_Deaths 0 dtype: int64
The chunk of code below merges the data together and applies a few necessary transformations. We perform an inner join, so that we only keep counties that appear in all datasets.
# Inner-join the three sources on the county FIPS code: ACS, then the Census
# rural shares, then the County Health Rankings measures.
county_data = (county_ACS_data
               .merge(percent_rural_data, on='County_FIPS', how='inner')
               .merge(health_rankings_data, on='County_FIPS', how='inner'))
# Express driving deaths as an annual rate per 100,000 residents, then drop the
# raw annual count since it is no longer needed.
county_data['Driving_Deaths_Per_100k'] = 100000 * (county_data['Annual_Driving_Deaths'] /
                                                   county_data['Total_Population'])
county_data = county_data.drop(columns='Annual_Driving_Deaths')
In this section, we perform some exploratory data analysis. First, we check for missing values in the chunk of code below.
# Check for missing values in each column of the merged county data
county_data.isna().sum()
County_FIPS 0 Total_Population 0 Percent_Population_Female 0 Percent_Adult_Pop_Married 0 Median_Age 0 Percent_Pop_Non_Hispanic_White 0 Percent_Pop_Non_Hispanic_Black 0 Percent_Pop_Non_Hispanic_Indian_Alaskan_Native 0 Percent_Pop_Hispanic 0 Percent_Pop_Under_65_Without_Health_Insurance 0 Percent_Pop_Active_LF_Unemployed 1 Percent_Some_College 0 Percent_Rural 0 Percent_Adult_Pop_Obese 0 Percent_Adult_Pop_Smokers 0 Driving_Deaths_Per_100k 0 dtype: int64
We can see above that there is only one variable that has any missing observations, and that variable only has one missing value. We can drop this observation.
# Remove the observation with a missing value (one county is missing the
# unemployment share)
county_data = county_data.dropna()
# Print the resulting shape of the data
print(county_data.shape)
(3134, 16)
We can see that we have 3,134 counties, compared to 3,139 in Wallace, Sharfstein, and Kaminsky (2019). Next, let's look at summary statistics of our data.
# Print summary statistics for every variable except the FIPS identifier
county_data.drop('County_FIPS', axis = 1).describe()
| Total_Population | Percent_Population_Female | Percent_Adult_Pop_Married | Median_Age | Percent_Pop_Non_Hispanic_White | Percent_Pop_Non_Hispanic_Black | Percent_Pop_Non_Hispanic_Indian_Alaskan_Native | Percent_Pop_Hispanic | Percent_Pop_Under_65_Without_Health_Insurance | Percent_Pop_Active_LF_Unemployed | Percent_Some_College | Percent_Rural | Percent_Adult_Pop_Obese | Percent_Adult_Pop_Smokers | Driving_Deaths_Per_100k | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3.134000e+03 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 | 3134.000000 |
| mean | 1.030076e+05 | 49.917500 | 53.427406 | 41.294703 | 76.587768 | 8.941634 | 1.738242 | 9.251753 | 12.192041 | 3.248671 | 57.902368 | 58.584809 | 30.943267 | 18.381270 | 22.600971 |
| std | 3.302945e+05 | 2.379855 | 6.578355 | 5.399314 | 20.074250 | 14.477160 | 7.158241 | 13.760901 | 6.012216 | 1.434644 | 11.825224 | 31.486975 | 4.461280 | 3.746685 | 25.996383 |
| min | 7.500000e+01 | 21.003945 | 21.516393 | 21.700000 | 0.727768 | 0.000000 | 0.000000 | 0.000000 | 1.948052 | 0.000000 | 15.175879 | 0.000000 | 10.700000 | 6.900000 | 0.000000 |
| 25% | 1.099075e+04 | 49.405858 | 49.691113 | 38.000000 | 64.875669 | 0.636175 | 0.125436 | 2.111982 | 7.521675 | 2.381432 | 49.855528 | 33.215524 | 28.500000 | 15.700000 | 11.280515 |
| 50% | 2.578900e+04 | 50.379531 | 54.047712 | 41.200000 | 83.998502 | 2.170781 | 0.276400 | 4.100693 | 11.088614 | 3.147290 | 57.963768 | 59.480480 | 31.200000 | 17.800000 | 17.489525 |
| 75% | 6.751500e+04 | 51.108192 | 57.668778 | 44.400000 | 92.702581 | 9.872446 | 0.667669 | 9.558570 | 15.484374 | 3.925208 | 66.482921 | 87.786979 | 33.700000 | 20.700000 | 26.370009 |
| max | 1.009805e+07 | 58.613660 | 78.798587 | 67.000000 | 100.000000 | 87.412281 | 82.479959 | 99.068770 | 49.454779 | 16.470168 | 100.000000 | 100.000000 | 46.600000 | 40.000000 | 725.075529 |
From above, we can see that many of these variables appear to be normally distributed. However, some have clear outliers. For instance, there is one county that appears to be only 21% female, roughly 12 standard deviations below the median! Let's help visualize some of these trends below and show that these outliers are mostly driven by counties with a very small population.
# Shared log-scale x-axis tick positions (raw counts) and labels (in thousands)
pop_ticks = [100, 1000, 10000, 100000, 1000000, 10000000]
pop_tick_labels = ['0.1', '1', '10', '100', '1,000', '10,000']

# Driving deaths vs. county population (log-scaled x-axis)
plt.scatter(county_data['Total_Population'], county_data['Driving_Deaths_Per_100k'])
plt.xscale('log')
plt.xticks(pop_ticks, pop_tick_labels)
plt.xlabel('Total County Population (Thousands)')
plt.ylabel('Average Annual Driving Deaths \nPer 100,000 Residents')
plt.title('Population and Driving Deaths by U.S. County, 2014-2018')
plt.show()

# Percent female vs. county population (log-scaled x-axis)
plt.scatter(county_data['Total_Population'], county_data['Percent_Population_Female'])
plt.xscale('log')
plt.xticks(pop_ticks, pop_tick_labels)
plt.xlabel('Total County Population (Thousands)')
plt.ylabel('Percent of Population That is Female')
plt.title('Population and Gender Composition by U.S. County, 2014-2018')
plt.show()
Additionally, let's look at the correlation of our three health variables.
# Pairwise correlations of the three health outcome variables
county_data[['Driving_Deaths_Per_100k', 'Percent_Adult_Pop_Smokers', 'Percent_Adult_Pop_Obese']].corr()
| Driving_Deaths_Per_100k | Percent_Adult_Pop_Smokers | Percent_Adult_Pop_Obese | |
|---|---|---|---|
| Driving_Deaths_Per_100k | 1.000000 | 0.025033 | 0.073188 |
| Percent_Adult_Pop_Smokers | 0.025033 | 1.000000 | 0.613971 |
| Percent_Adult_Pop_Obese | 0.073188 | 0.613971 | 1.000000 |
We can see that smoking and obesity are fairly highly positively correlated, which is maybe expected as both are more behavioral aspects of health. We can see that driving deaths is very uncorrelated with either, which also makes intuitive sense. Let's visualize these correlations below.
# Pairwise scatterplots of the three health outcomes, using display-friendly
# column names for the axis labels.
display_names = {"Percent_Adult_Pop_Smokers": "Smoking Rate (%)",
                 "Percent_Adult_Pop_Obese": "Obesity Rate (%)",
                 "Driving_Deaths_Per_100k": "Driving Deaths per 100,000 Residents"}
county_data_renamed = county_data.rename(columns=display_names)
# Plot only the three renamed outcome columns (dict order fixes column order).
pairwise_scatterplot = sns.pairplot(county_data_renamed[list(display_names.values())])
pairwise_scatterplot.fig.text(0, 1.05, 'Correlations of Health Outcomes by County', fontsize=18)
Text(0, 1.05, 'Correlations of Health Outcomes by County')
With the plots above, we can see that there are definitely outliers in the data and they are primarily counties with small populations. All of these counties are included in the original data but we know that k-means clustering can be very sensitive to outliers. Thus, in the analysis below we also see how our results change when we remove counties with fewer than 10,000 residents. This threshold is somewhat arbitrary but removes virtually all of the most extreme outliers in our data.
# Restrict to counties with at least 10,000 residents, dropping the small
# counties that drive most of the extreme outliers.
population_floor = 10000
county_data_10k_plus = county_data.loc[county_data['Total_Population'] >= population_floor]
# Number of counties remaining after the restriction
county_data_10k_plus.shape
(2431, 16)
We can see that we have dropped approximately 700 out of over 3,100 counties.
Next, let's do some preprocessing of our data before we conduct our cluster analysis. Particularly, we keep only the variables we use in the clustering and for both datasets (both including and excluding small counties) and standardize these variables in both datasets.
# Columns excluded from the clustering: the identifier, population, and the
# three health outcomes (defined once so both datasets stay in sync; the
# original repeated this list verbatim for each dataset).
non_cluster_cols = ['County_FIPS', 'Total_Population', 'Percent_Adult_Pop_Obese',
                    'Percent_Adult_Pop_Smokers', 'Driving_Deaths_Per_100k']
# Clustering variables for the full sample
county_data_cluster_vars = county_data.drop(non_cluster_cols, axis = 1)
# Clustering variables restricted to counties with at least 10,000 residents
county_data_10k_plus_cluster_vars = county_data_10k_plus.drop(non_cluster_cols, axis = 1)
# Standardize each dataset separately (each gets its own fitted scaler)
county_data_cluster_vars = pd.DataFrame(StandardScaler().fit_transform(county_data_cluster_vars),
                                        columns = county_data_cluster_vars.columns)
county_data_10k_plus_cluster_vars = pd.DataFrame(StandardScaler().fit_transform(county_data_10k_plus_cluster_vars),
                                                 columns = county_data_10k_plus_cluster_vars.columns)
In this section we apply five clustering methods.
We then use cluster validation methods to assess the validity of each of the five methods and determine which methods perform relatively better than others. As discussed in more detail in the final report, we only remove potential outliers from $k$-means clustering because the other clustering techniques are much less sensitive to outliers.
As a first step, we simply use k-means clustering with 8 clusters and including all counties, as done in Wallace, Sharfstein, and Kaminsky (2019). We then map these clusters and compare to the map in the paper. In addition to mapping the clusters, we also create percentiles of obesity within each cluster, and across all counties, and map the difference to compare that map to Wallace, Sharfstein, and Kaminsky (2019). This is discussed in more detail in the final report.
# We repeat this analysis several times with k-means analysis so we define a function below that automatically
# conducts k means clustering and produces the two maps described above
def k_means_clustering_and_map_production(data_for_clustering, complete_data, n_clusters, random_state,
                                          init, cluster_map_title, percentile_map_title,
                                          cluster_map_color_scale):
    '''
    Run k-means clustering on county data and display two choropleth maps.

    For a given number of clusters and a given dataset, this function applies
    k-means clustering to county data and produces a choropleth map of the
    clusters across counties. Additionally, it assigns each county an obesity
    percentile within its cluster, subtracts the county's overall obesity
    percentile, and maps that difference.

    This function has the following 8 inputs:
    1. data_for_clustering should be a dataframe containing only the variables
       used in cluster analysis. These variables should be standardized.
    2. complete_data should be a dataframe that must contain a variable called
       'County_FIPS' containing the county FIPS code, 'Percent_Adult_Pop_Obese'
       containing the obesity percentage by county, and any other variables you
       wish to include.
    3. n_clusters is the number of clusters you want to use in your k-means
       analysis.
    4. random_state is the random state, as an integer, that you want to use in
       k-means analysis.
    5. init is the type of initialization you want to use for k-means
       clustering.
    6. cluster_map_title is the title you want to use for your choropleth map
       of the clusters.
    7. percentile_map_title is the title you want to use for your map of the
       differences between overall county obesity percentiles and
       within-cluster percentiles.
    8. cluster_map_color_scale is the color scale you want to use for the
       choropleth map of clusters and must have a length equal to the number of
       clusters.

    Returns a copy of complete_data with the cluster assignments
    ('k_means_clusters') and the three obesity percentile columns added. The
    two maps are shown as a side effect, not saved to disk.
    '''
    # Work on deep copies so the caller's dataframes are never modified
    complete_clustering_data = complete_data.copy(deep = True)
    clustering_data_standardized = data_for_clustering.copy(deep = True)
    # Get predictions from k-means clustering
    km_clusters_predictions = KMeans(n_clusters = n_clusters, init = init, random_state = random_state).fit_predict(clustering_data_standardized)
    # Put cluster predictions in complete dataset (add 1 so that cluster numbers start at 1, not 0)
    complete_clustering_data['k_means_clusters'] = km_clusters_predictions + 1
    # Next, rank counties in percentiles within each cluster on obesity
    complete_clustering_data['cluster_percentiles_obesity_k_means'] = 100 * complete_clustering_data.groupby(['k_means_clusters']).Percent_Adult_Pop_Obese.rank(pct=True)
    # Next, rank counties in percentiles across all counties on obesity
    complete_clustering_data['all_county_obesity_percentiles'] = 100 * complete_clustering_data.Percent_Adult_Pop_Obese.rank(pct = True)
    # Calculate the difference in percentiles (cluster percentile minus overall percentile)
    complete_clustering_data['percentile_difference_cluster_vs_overall_obesity_k_means'] = complete_clustering_data['cluster_percentiles_obesity_k_means'] - complete_clustering_data['all_county_obesity_percentiles']
    # Create a choropleth map of the clusters
    cluster_map = ff.create_choropleth(fips = complete_clustering_data['County_FIPS'],
                                       values = complete_clustering_data['k_means_clusters'],
                                       title = cluster_map_title,
                                       legend_title = 'Cluster Number',
                                       colorscale = cluster_map_color_scale,
                                       state_outline = {'color' : 'black'},
                                       county_outline={'color': 'grey', 'width': 0.5})
    cluster_map.show()
    # Create a choropleth map of the change in percentiles by county
    percentile_map = ff.create_choropleth(fips = complete_clustering_data['County_FIPS'],
                                          values = complete_clustering_data['percentile_difference_cluster_vs_overall_obesity_k_means'],
                                          title = percentile_map_title,
                                          legend_title = 'Within-Cluster Percentile Minus Overall Percentile',
                                          binning_endpoints=[-25, -10, 0, 10, 25, 50],
                                          colorscale = ['darkblue', 'royalblue', 'lightblue', 'pink', 'orange', 'red',
                                                        'white'],
                                          state_outline = {'color' : 'black'},
                                          county_outline={'color': 'grey', 'width': 0.5})
    percentile_map.show()
    return complete_clustering_data
# Apply this function for 8 clusters, using all counties
complete_clustering_data_8_clusters = k_means_clustering_and_map_production(data_for_clustering = county_data_cluster_vars,
                                          complete_data = county_data, n_clusters = 8,
                                          random_state = 202012, init = 'random',
                                          cluster_map_title = 'Clusters Obtained From k-Means Clustering with 8 Clusters, Using All Counties',
                                          percentile_map_title = 'Cluster-Adjusted Obesity Percentile Change by County (8 Clusters), Using All Counties',
                                          cluster_map_color_scale = ["skyblue", "royalblue", "red", "pink", "green",
                                                                     "orange", "lightgreen", "yellow"])
# Print the total number of counties and the total population in each cluster
print("Number of counties in each cluster:")
print(complete_clustering_data_8_clusters.k_means_clusters.value_counts())
print("Total population in each cluster:")
print(complete_clustering_data_8_clusters.groupby(['k_means_clusters'])['Total_Population'].agg('sum'))
Number of counties in each cluster: 5 781 2 717 1 574 7 427 3 304 8 182 4 111 6 38 Name: k_means_clusters, dtype: int64 Total population in each cluster: k_means_clusters 1 10552574 2 177251518 3 34378056 4 1690810 5 27562037 6 832930 7 15591403 8 54966456 Name: Total_Population, dtype: int64
The plots and statistics above are discussed in more detail in the final report, but overall they are very similar to the findings discussed in Wallace, Sharfstein, and Kaminsky (2019). However, this takes their inclusion of all counties, their clustering method, and the number of clusters they use as given. Next, let's use SSE to determine the optimal number of clusters to use in this scenario.
In the chunks of code below we again use all counties. However, we vary the number of clusters and calculate the sum of squared errors (SSE) and average silhouette score resulting from each value, using $k=2,3,...,14,15$ clusters. We then plot both of these to select the number of clusters.
# Lists collecting the SSE (inertia) and average silhouette score for each k
SSE_scores = []
silhouette_scores = []
# Loop over k=2,3,...,14,15 clusters
for n_clusters in range(2, 16):
    # Fit k-means once per k and reuse the fitted model for both metrics.
    # (The original called fit() and then fit_predict() separately, running
    # the clustering twice per k; with the same random_state the labels are
    # identical, so reusing .labels_ halves the work without changing results.)
    km_model = KMeans(n_clusters = n_clusters, init = 'random', random_state = 202012).fit(county_data_cluster_vars)
    # Within-cluster sum of squared errors for this k
    SSE_scores.append(km_model.inertia_)
    # Average silhouette score for the fitted labels
    silhouette_scores.append(silhouette_score(county_data_cluster_vars, km_model.labels_))
# Candidate numbers of clusters examined above
ks = range(2, 16)

# Elbow plot: SSE against the number of clusters
plt.plot(ks, SSE_scores)
plt.suptitle('Sum of Squared Errors vs. Number of Clusters', fontsize=14, y=1)
plt.title('For $k$-Means Clustering, Including All Counties')
plt.xlabel('Number of Clusters, $k$')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.show()

# Average silhouette score against the number of clusters
plt.plot(ks, silhouette_scores)
plt.suptitle('Average Silhouette Score vs. Number of Clusters', fontsize=14, y=1)
plt.title('For $k$-Means Clustering, Including All Counties')
plt.xlabel('Number of Clusters, $k$')
plt.ylabel('Average Silhouette Score')
plt.show()
As discussed in more detail in the final report, the two plots above lead us to choose $k=5$. Thus, we find that the authors use too many clusters. Overall, the plots provide limited evidence to choose $k=8$, although it would not be unreasonable to choose $k=7$. Below, let's recreate the above two maps with $k=5$.
# Apply the k-means function for 5 clusters (the value favored by the SSE and
# silhouette plots above), using all counties, to produce the two maps
complete_clustering_data_5_clusters = k_means_clustering_and_map_production(data_for_clustering = county_data_cluster_vars,
                                          complete_data = county_data, n_clusters = 5,
                                          random_state = 202012, init = 'random',
                                          cluster_map_title = 'Clusters Obtained From k-Means Clustering with 5 Clusters, Using All Counties',
                                          percentile_map_title = 'Cluster-Adjusted Obesity Percentile Change by County (5 Clusters), Using All Counties',
                                          cluster_map_color_scale = ["red", "royalblue", "green", "orange", "yellow"])
The differences between these maps using 5 clusters and the maps obtained by the authors are discussed in more detail in the final report.
Here we repeat the analysis, but excluding counties with fewer than 10,000 residents. As discussed in more detail above and in the final report, $k$-means clustering is highly sensitive to outliers and thus it is worth investigating whether removing outliers substantially affects our results. The chunks of code below determine the optimal number of clusters only among counties with at least 10,000 residents.
# Lists collecting the SSE (inertia) and average silhouette score for each k,
# now restricted to counties with at least 10,000 residents
SSE_scores = []
silhouette_scores = []
# Loop over k=2,3,...,14,15 clusters
for n_clusters in range(2, 16):
    # Fit k-means once per k and reuse the fitted model for both metrics.
    # (The original called fit() and then fit_predict() separately, running
    # the clustering twice per k; with the same random_state the labels are
    # identical, so reusing .labels_ halves the work without changing results.)
    km_model = KMeans(n_clusters = n_clusters, init = 'random', random_state = 202012).fit(county_data_10k_plus_cluster_vars)
    # Within-cluster sum of squared errors for this k
    SSE_scores.append(km_model.inertia_)
    # Average silhouette score for the fitted labels
    silhouette_scores.append(silhouette_score(county_data_10k_plus_cluster_vars, km_model.labels_))
# Elbow plot: SSE against the number of clusters, for the restricted sample
plt.plot(range(2,16), SSE_scores)
plt.xlabel('Number of Clusters, $k$')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.suptitle('Sum of Squared Errors vs. Number of Clusters', fontsize=14, y=1)
# Fix: this analysis excludes counties with under 10,000 residents, so the
# subtitle should not claim all counties are included
plt.title('For $k$-Means Clustering, Excluding Counties with Under 10,000 Residents')
plt.show()
# Average silhouette score against the number of clusters, restricted sample
plt.plot(range(2,16), silhouette_scores)
plt.xlabel('Number of Clusters, $k$')
plt.ylabel('Average Silhouette Score')
plt.suptitle('Average Silhouette Score vs. Number of Clusters', fontsize=14, y=1)
# Fix: corrected subtitle for the restricted sample (see above)
plt.title('For $k$-Means Clustering, Excluding Counties with Under 10,000 Residents')
plt.show()
As discussed in more detail in the final report, the two plots above again favor using $k=5$ clusters even when removing outliers. Below, let's visualize these resulting clusters and the within-cluster obesity percentiles.
# Apply the k-means function with 5 clusters, restricted to counties with at
# least 10,000 residents, to determine clusters and produce the two maps
removing_small_counties_5_clusters = k_means_clustering_and_map_production(data_for_clustering = county_data_10k_plus_cluster_vars,
                                          complete_data = county_data_10k_plus, n_clusters = 5,
                                          random_state = 202012, init = 'random',
                                          cluster_map_title = 'Clusters Obtained From k-Means Clustering with 5 Clusters, Using Counties with 10k+ Residents',
                                          percentile_map_title = 'Cluster-Adjusted Obesity Percentile Change (5 Clusters), Using Counties with 10k+ Residents',
                                          cluster_map_color_scale = ["red", "royalblue", "green", "orange", "yellow"])
From a visual inspection of the maps above, excluding outliers actually does not seem to drastically alter our clusters. We will explore this in more detail in our cluster validation.
Here, we use agglomerative hierarchical clustering with the complete link method, using all counties. First, let's visualize a dendrogram to determine the optimal number of clusters.
# Run complete-link agglomerative hierarchical clustering on all counties
linked = linkage(county_data_cluster_vars, method = 'complete')
# Plot the dendrogram, truncated to the last 15 merged clusters
# (removed dead commented-out labels argument from the original)
plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           distance_sort='descending',
           show_leaf_counts=True,
           truncate_mode='lastp', # show only the last p merged clusters
           p = 15)
plt.suptitle('Dendrogram Using All Counties, for Complete Link Agglomerative Hierarchical Clustering', fontsize=14, y=1)
# Fix: corrected the "Clusterrs" typo in the displayed subtitle
plt.title('Truncated to Show Only Last 15 Merged Clusters')
plt.xlabel('Number of Counties in Each Cluster')
plt.ylabel('Distance')
plt.show()
As discussed in more detail in the final report, this leads us to choose 3 clusters. Let's repeat the process of making a map of this cluster below (we skip a percentile map as that is very uninformative in this case).
# Agglomerative hierarchical clustering with complete linkage, cut at 3 clusters.
complete_link_model = AgglomerativeClustering(linkage='complete', n_clusters=3)
# fit_predict numbers clusters from 0, so shift by one before storing the
# labels in the full county dataset (cluster numbers then start at 1).
county_data['ahc_clusters'] = complete_link_model.fit_predict(county_data_cluster_vars) + 1
# Draw a county-level choropleth of the hierarchical-clustering labels.
ahc_cluster_map = ff.create_choropleth(
    fips=county_data['County_FIPS'],
    values=county_data['ahc_clusters'],
    title='Clusters Obtained From Hierarchical Clustering with 3 Clusters, Using All Counties',
    legend_title='Cluster Number',
    colorscale=['skyblue', 'green', 'orange'],
    county_outline={'color': 'grey', 'width': 0.5},
    state_outline={'color': 'black'})
ahc_cluster_map.show()
From above, we can see that the clusters produced by hierarchical clustering are unlikely to be very useful in this research context, as the vast majority of counties are in the same cluster. We discuss this in more detail in the final report.
Here we apply density-based scanning to the dataset using all counties. As discussed in more detail in the final report, a main advantage of density-based scanning is that it can automatically identify and remove outliers as noise, so there is no need to "manually" drop outliers. Additionally, density-based scanning automatically determines the number of clusters.
Below, we apply density-based scanning and report the number of clusters. We will see that density-based clustering performs extremely poorly on this dataset, labelling almost all data points as noise and producing only two distinct clusters.
# Use a DBSCAN algorithm with eps = 0.9 and min_samples = 12. min_samples is
# chosen as the number of clustering dimensions plus 1 (a common rule of
# thumb; presumably 11 clustering variables — confirm against the data-
# cleaning section).
db = DBSCAN(eps = 0.9, min_samples = 12)
# fit_predict fits the model and returns one label per county; this is
# identical to db.labels_ after fitting, so we keep a single copy.
db_points = db.fit_predict(county_data_cluster_vars)
# Add the predicted labels to the county dataset. A label of -1 marks noise,
# while 0 and 1 identify the two separate clusters DBSCAN found.
county_data['dbscan_point'] = db_points
# Human-readable label column for mapping. Use .loc rather than chained
# indexing (county_data.dbscan_label[...] = ...) so the assignment is
# guaranteed to write to the original dataframe instead of a temporary copy
# (chained assignment raises SettingWithCopyWarning and may silently no-op).
county_data['dbscan_label'] = 'Noise (Removed)'
county_data.loc[county_data['dbscan_point'] == 0, 'dbscan_label'] = "Cluster 1"
county_data.loc[county_data['dbscan_point'] == 1, 'dbscan_label'] = "Cluster 2"
# Map every county's DBSCAN outcome: one of the two clusters, or removed noise.
dbscan_map = ff.create_choropleth(
    fips=county_data['County_FIPS'],
    values=county_data['dbscan_label'],
    title='Clusters Obtained From Density-Based Clustering, Using All Counties',
    legend_title='Cluster Type',
    colorscale=['skyblue', 'green', 'orange'],
    county_outline={'color': 'grey', 'width': 0.5},
    state_outline={'color': 'black'})
dbscan_map.show()
The choropleth map above shows that density-based clustering clearly has serious issues on this dataset, as there are only two clusters produced and the majority of counties are removed as noise.
In this final section of code, we validate the five clustering methods we applied. That is, we try to determine which perform better than others. In particular, we consider the following validation methods:
# Print the average silhouette score for each of our five sets of clusters
# (higher is better). Each score is computed on the same feature matrix the
# method was fit on.
# K-means, all counties, k=8
print(f"Silhouette score for k-means clustering, all counties, k=8 is {round(silhouette_score(county_data_cluster_vars, complete_clustering_data_8_clusters['k_means_clusters']), 3)}.")
# K-means, all counties, k=5
print(f"Silhouette score for k-means clustering, all counties, k=5 is {round(silhouette_score(county_data_cluster_vars, complete_clustering_data_5_clusters['k_means_clusters']), 3)}.")
# K-means, only counties with 10k+ residents, k=5 (fixed the duplicated word
# "counties counties" in the printed message)
print(f"Silhouette score for k-means clustering, removing small counties, k=5 is {round(silhouette_score(county_data_10k_plus_cluster_vars, removing_small_counties_5_clusters['k_means_clusters']), 3)}.")
# AHC, 3 clusters
print(f"Silhouette score for AHC with 3 clusters is {round(silhouette_score(county_data_cluster_vars, county_data['ahc_clusters']), 3)}.")
# DBSCAN: the silhouette score is undefined for noise points (label -1), so
# restrict the data and labels to counties assigned to a real cluster.
# Compute the mask once instead of twice.
non_noise = (county_data['dbscan_point'] >= 0).values
print(f"Silhouette score for DBSCAN with 2 clusters is {round(silhouette_score(county_data_cluster_vars.iloc[non_noise, :], county_data.dbscan_point[non_noise]), 3)}.")
Silhouette score for k-means clustering, all counties, k=8 is 0.172. Silhouette score for k-means clustering, all counties, k=5 is 0.201. Silhouette score for k-means clustering, removing small counties counties, k=5 is 0.199. Silhouette score for AHC with 3 clusters is 0.478. Silhouette score for DBSCAN with 2 clusters is 0.097.
def regression_of_health_outcomes_on_cluster_dummies(cluster_labels, outcome_data, outcome_var_list):
    '''
    Regress each listed outcome on cluster-membership dummies and print R^2.

    Given a variable with cluster labels, this function runs an OLS regression
    of a set of outcome variables on a set of dummy variables for the cluster
    labels, including a constant, and reports the R^2 value for each
    regression. This function has three inputs.
    1. cluster_labels should be a vector of cluster labels, row-aligned with
       outcome_data; one dummy column is created per distinct label.
    2. outcome_data should be a dataframe containing the outcome variables of
       interest. It is only read, never modified.
    3. outcome_var_list should be a list of outcome variable names in the
       outcome_data dataframe.
    This function then prints the R^2 values from regressing each outcome on
    the set of dummy variables; it returns None.
    '''
    # Create a linear regression model that includes a constant. (The full
    # dummy set plus the constant is collinear, but least squares still yields
    # unique fitted values, so R^2 is well defined.)
    linear_regression_model = LinearRegression(fit_intercept = True)
    # Build the cluster dummy variables once; the same regressors are reused
    # for every outcome. (Replaces a truncated comment and an unnecessary
    # deep copy of the whole outcome dataframe in the original.)
    cluster_dummies = pd.get_dummies(cluster_labels)
    # Loop over list of outcome vars
    for outcome_var in outcome_var_list:
        # Fit this regression model, then calculate the R^2
        linear_regression_model.fit(cluster_dummies, outcome_data[outcome_var])
        R_squared = linear_regression_model.score(cluster_dummies,
                                                  outcome_data[outcome_var])
        # Print the R^2 value from each regression
        print(f"R^2 value from regression of {outcome_var} on cluster dummies is {round(R_squared, 3)}.")
# Health outcomes to regress on each method's cluster dummy variables.
list_of_health_outcomes = ['Percent_Adult_Pop_Obese', 'Percent_Adult_Pop_Smokers', 'Driving_Deaths_Per_100k']
# For DBSCAN, noise points (label -1) must be dropped before regressing.
dbscan_core_mask = county_data['dbscan_point'] >= 0
# One (description, cluster labels, outcome data) triple per clustering
# solution, in the same order as the original report.
cluster_specifications = [
    ("k-means, all counties, 8 clusters",
     complete_clustering_data_8_clusters['k_means_clusters'], county_data),
    ("k-means, all counties, 5 clusters",
     complete_clustering_data_5_clusters['k_means_clusters'], county_data),
    ("k-means, removing small counties, 5 clusters",
     removing_small_counties_5_clusters['k_means_clusters'], county_data_10k_plus),
    ("AHC, 3 clusters",
     county_data['ahc_clusters'], county_data),
    ("DBSCAN",
     county_data.dbscan_point[dbscan_core_mask], county_data[dbscan_core_mask]),
]
# Print the R^2 values for every clustering solution.
for description, labels, outcome_frame in cluster_specifications:
    print(f"R^2 values from {description}:")
    regression_of_health_outcomes_on_cluster_dummies(labels, outcome_frame, list_of_health_outcomes)
R^2 values from k-means, all counties, 8 clusters: R^2 value from regression of Percent_Adult_Pop_Obese on cluster dummies is 0.241. R^2 value from regression of Percent_Adult_Pop_Smokers on cluster dummies is 0.331. R^2 value from regression of Driving_Deaths_Per_100k on cluster dummies is 0.081. R^2 values from k-means, all counties, 5 clusters: R^2 value from regression of Percent_Adult_Pop_Obese on cluster dummies is 0.203. R^2 value from regression of Percent_Adult_Pop_Smokers on cluster dummies is 0.209. R^2 value from regression of Driving_Deaths_Per_100k on cluster dummies is 0.078. R^2 values from k-means, removing small counties, 5 clusters: R^2 value from regression of Percent_Adult_Pop_Obese on cluster dummies is 0.223. R^2 value from regression of Percent_Adult_Pop_Smokers on cluster dummies is 0.196. R^2 value from regression of Driving_Deaths_Per_100k on cluster dummies is 0.21. R^2 values from AHC, 3 clusters: R^2 value from regression of Percent_Adult_Pop_Obese on cluster dummies is 0.007. R^2 value from regression of Percent_Adult_Pop_Smokers on cluster dummies is 0.09. R^2 value from regression of Driving_Deaths_Per_100k on cluster dummies is 0.003. R^2 values from DBSCAN: R^2 value from regression of Percent_Adult_Pop_Obese on cluster dummies is 0.0. R^2 value from regression of Percent_Adult_Pop_Smokers on cluster dummies is 0.002. R^2 value from regression of Driving_Deaths_Per_100k on cluster dummies is 0.003.